# Library ----
# Load library
library(tidyverse)
library(stringi)
# Data ----
# IMDB Top 250 lists plus 5000-odd IMDB records, hosted on data.world
# https://data.world/studentoflife/imdb-top-250-lists-and-5000-or-so-data-records
imdb_url <- "https://query.data.world/s/rr46ndg7fyne54q7oonmvzxbaxg3zn"
df <- read.csv(imdb_url, header = TRUE, stringsAsFactors = FALSE)
# List the columns that were read
colnames(df)
## [1] "Title" "Year" "Rated" "Released"
## [5] "Runtime" "Genre" "Director" "Writer"
## [9] "Actors" "Plot" "Language" "Country"
## [13] "Awards" "Poster" "Ratings.Source" "Ratings.Value"
## [17] "Metascore" "imdbRating" "imdbVotes" "imdbID"
## [21] "Type" "DVD" "BoxOffice" "Production"
## [25] "Website" "Response" "tomatoURL"
# GREP & GREPL ----
# Directors whose name contains "Courtney" (case-insensitive); value = TRUE
# returns the matching strings rather than their positions
grep(pattern = "Courtney", x = df$Director, ignore.case = TRUE, value = TRUE)
## [1] "Courtney Solomon" "Courtney Solomon" "Courtney Hunt"
# Writer credits that contain "Courtney" (case-insensitive)
grep(pattern = "Courtney", x = df$Writer, ignore.case = TRUE, value = TRUE)
## [1] "Brent Monahan (novel), Courtney Solomon"
## [2] "Irwin Yablans (story), C. Courtney Joyner (screenplay)"
## [3] "C. Courtney Joyner, Mike Malone (additional story material), Darin Scott, Jeff Burr, Mike Malone (additional story material)"
## [4] "Courtney Hunt"
# Comma-separated cast strings in which at least one actor is named "Courtney"
grep(pattern = "Courtney", x = df$Actors, ignore.case = TRUE, value = TRUE)
## [1] "Scott 'Carrot Top' Thompson, Courtney Thorne-Smith, Larry Miller, Raquel Welch"
## [2] "Arnold Schwarzenegger, Jason Clarke, Emilia Clarke, Jai Courtney"
## [3] "Brenton Thwaites, John Samaha, Courtney Eaton, Nikolaj Coster-Waldau"
## [4] "Kate Winslet, Jai Courtney, Mekhi Phifer, Shailene Woodley"
## [5] "Bruce Willis, Jai Courtney, Sebastian Koch, Mary Elizabeth Winstead"
## [6] "Shailene Woodley, Theo James, Ashley Judd, Jai Courtney"
## [7] "Joel Courtney, Jessica Tuck, Joel McKinnon Miller, Ryan Lee"
## [8] "Sandra Bullock, Julian McMahon, Shyann McClure, Courtney Taylor Burness"
## [9] "Milla Jovovich, Brian Krause, Lisa Pelikan, Courtney Barilla"
## [10] "Elijah Wood, Courtney B. Vance, Robbie Coltrane, Jason Robards"
## [11] "Lauren German, Michael Biehn, Milo Ventimiglia, Courtney B. Vance"
## [12] "Heather Sossaman, Matthew Bohrer, Courtney Halverson, Shelley Hennig"
## [13] "Catherine Parker, Courtney Bell, Dave Levine, Justin Gordon"
# Build the cast list: glue every matching cast string into one long string,
# then split it back into individual names (str_split returns a list)
courtney_rows <- grep("Courtney", df$Actors, value = TRUE, ignore.case = TRUE)
courtney_cast <- str_split(paste(courtney_rows, collapse = ", "), ", ")
courtney_cast
## [[1]]
## [1] "Scott 'Carrot Top' Thompson" "Courtney Thorne-Smith"
## [3] "Larry Miller" "Raquel Welch"
## [5] "Arnold Schwarzenegger" "Jason Clarke"
## [7] "Emilia Clarke" "Jai Courtney"
## [9] "Brenton Thwaites" "John Samaha"
## [11] "Courtney Eaton" "Nikolaj Coster-Waldau"
## [13] "Kate Winslet" "Jai Courtney"
## [15] "Mekhi Phifer" "Shailene Woodley"
## [17] "Bruce Willis" "Jai Courtney"
## [19] "Sebastian Koch" "Mary Elizabeth Winstead"
## [21] "Shailene Woodley" "Theo James"
## [23] "Ashley Judd" "Jai Courtney"
## [25] "Joel Courtney" "Jessica Tuck"
## [27] "Joel McKinnon Miller" "Ryan Lee"
## [29] "Sandra Bullock" "Julian McMahon"
## [31] "Shyann McClure" "Courtney Taylor Burness"
## [33] "Milla Jovovich" "Brian Krause"
## [35] "Lisa Pelikan" "Courtney Barilla"
## [37] "Elijah Wood" "Courtney B. Vance"
## [39] "Robbie Coltrane" "Jason Robards"
## [41] "Lauren German" "Michael Biehn"
## [43] "Milo Ventimiglia" "Courtney B. Vance"
## [45] "Heather Sossaman" "Matthew Bohrer"
## [47] "Courtney Halverson" "Shelley Hennig"
## [49] "Catherine Parker" "Courtney Bell"
## [51] "Dave Levine" "Justin Gordon"
# Flatten the list into a plain character vector
courtney_cast <- unlist(courtney_cast)
courtney_cast
## [1] "Scott 'Carrot Top' Thompson" "Courtney Thorne-Smith"
## [3] "Larry Miller" "Raquel Welch"
## [5] "Arnold Schwarzenegger" "Jason Clarke"
## [7] "Emilia Clarke" "Jai Courtney"
## [9] "Brenton Thwaites" "John Samaha"
## [11] "Courtney Eaton" "Nikolaj Coster-Waldau"
## [13] "Kate Winslet" "Jai Courtney"
## [15] "Mekhi Phifer" "Shailene Woodley"
## [17] "Bruce Willis" "Jai Courtney"
## [19] "Sebastian Koch" "Mary Elizabeth Winstead"
## [21] "Shailene Woodley" "Theo James"
## [23] "Ashley Judd" "Jai Courtney"
## [25] "Joel Courtney" "Jessica Tuck"
## [27] "Joel McKinnon Miller" "Ryan Lee"
## [29] "Sandra Bullock" "Julian McMahon"
## [31] "Shyann McClure" "Courtney Taylor Burness"
## [33] "Milla Jovovich" "Brian Krause"
## [35] "Lisa Pelikan" "Courtney Barilla"
## [37] "Elijah Wood" "Courtney B. Vance"
## [39] "Robbie Coltrane" "Jason Robards"
## [41] "Lauren German" "Michael Biehn"
## [43] "Milo Ventimiglia" "Courtney B. Vance"
## [45] "Heather Sossaman" "Matthew Bohrer"
## [47] "Courtney Halverson" "Shelley Hennig"
## [49] "Catherine Parker" "Courtney Bell"
## [51] "Dave Levine" "Justin Gordon"
# Keep only the distinct cast members whose own name contains "Courtney"
courtney_actors <- unique(
  grep("Courtney", courtney_cast, value = TRUE, ignore.case = TRUE)
)
courtney_actors
## [1] "Courtney Thorne-Smith" "Jai Courtney"
## [3] "Courtney Eaton" "Joel Courtney"
## [5] "Courtney Taylor Burness" "Courtney Barilla"
## [7] "Courtney B. Vance" "Courtney Halverson"
## [9] "Courtney Bell"
# Actors with Courtney as first name: "^" anchors the match to the start
grep(pattern = "^Courtney", x = courtney_actors, ignore.case = TRUE, value = TRUE)
## [1] "Courtney Thorne-Smith" "Courtney Eaton"
## [3] "Courtney Taylor Burness" "Courtney Barilla"
## [5] "Courtney B. Vance" "Courtney Halverson"
## [7] "Courtney Bell"
# Actors with Courtney as last name: "$" anchors the match to the end
grep(pattern = "Courtney$", x = courtney_actors, ignore.case = TRUE, value = TRUE)
## [1] "Jai Courtney" "Joel Courtney"
# Collect every distinct actor name: split each cast string on ", ",
# flatten, trim surrounding whitespace and de-duplicate
actors <- unique(str_trim(unlist(str_split(df$Actors, ", "))))
head(actors, 20)
## [1] "Cem Kurtoglu" "Hakan Ural" "Hazim Körmükçü"
## [4] "Tolga Karel" "Kirk Cameron" "Darren Doane"
## [7] "Bridgette Cameron" "Ben Kientz" "Jon Voight"
## [10] "Scott Baio" "Vanessa Angel" "Skyler Shaye"
## [13] "Daniel Küblböck" "Ulli Lommel" "Rudolf Waldemar Brem"
## [16] "Katja Rupé" "Tom Neyman" "John Reynolds"
## [19] "Diane Adelson" "Harold P. Warren"
# Try to find actors whose first name starts with C and ends with Y. The test
# only looks for a "y" followed by whitespace somewhere in the full name, so a
# few hits have a first name that doesn't actually end with Y
# (Caleb Landry Jones & Charles Henry Wyson)
starts_with_c <- grepl("^C", actors, ignore.case = TRUE)
y_before_space <- grepl("y\\s", actors, ignore.case = TRUE)
has_c_initial <- grepl("C\\.", actors, ignore.case = TRUE)
sort(actors[starts_with_c & y_before_space & !has_c_initial])
## [1] "Caity Lotz" "Caleb Landry Jones"
## [3] "Caley Hayes" "Candy Ford"
## [5] "Carey Crim" "Carey Lowell"
## [7] "Carey Means" "Carey Mulligan"
## [9] "Carly Nahon" "Carly Schroeder"
## [11] "Cary Elwes" "Cary Grant"
## [13] "Casey Affleck" "Casey Dubois"
## [15] "Casey Fallo" "Casey Gooden"
## [17] "Casey Groves" "Casey Hooper"
## [19] "Casey Twenter" "Cassidy Gifford"
## [21] "Cathy Meils" "Cathy Moriarty"
## [23] "Chaney Kley" "Charles Henry Wyson"
## [25] "Charley Grapewin" "Chauncey Leopardi"
## [27] "Chelsey Reist" "Cherry Jones"
## [29] "Chevy Chase" "Christy Chung"
## [31] "Christy Lighthouse" "Cicely Tyson"
## [33] "Cindy Butler" "Cindy Karr"
## [35] "Cindy Manion" "Clancy Brown"
## [37] "Clemency Burton-Hill" "Cody Horn"
## [39] "Cody Howard" "Cody Linley"
## [41] "Cody McMains" "Colby French"
## [43] "Corey Burton" "Corey Feldman"
## [45] "Corey Haim" "Corey Hawkins"
## [47] "Corey Johnson" "Corey Moosa"
## [49] "Corey Sevier" "Corey Stoll"
## [51] "Cory Fernandez" "Cory Hardrict"
## [53] "Cory Hodges" "Cory Monteith"
## [55] "Courteney Cox" "Courtney B. Vance"
## [57] "Courtney Barilla" "Courtney Bell"
## [59] "Courtney Eaton" "Courtney Halverson"
## [61] "Courtney Taylor Burness" "Courtney Thorne-Smith"
# Find actors whose last name starts with C and ends with Y. The test only
# looks for a C after any whitespace, so for example Larry the Cable Guy &
# John C. McGinley match even though their last names don't start with C
c_after_space <- grepl("\\sC", actors, ignore.case = TRUE)
ends_with_y <- grepl("y$", actors, ignore.case = TRUE)
actors[c_after_space & ends_with_y]
## [1] "Jonathan Cherry" "Mariah Carey" "Bill Cosby"
## [4] "Tom Courtenay" "Kevin Casey" "John C. McGinley"
## [7] "Wendell Corey" "Jennifer Connelly" "Jim Carrey"
## [10] "Sean Connery" "Christopher Carley" "Walter Connolly"
## [13] "Joyce Carey" "Mithun Chakraborty" "Alok Chakravarty"
## [16] "Raj Singh Chaudhary" "Tota Roy Chowdhury" "Sabyasachi Chakraborty"
## [19] "Aashish Chaudhary" "Mahima Chaudhry" "Larry the Cable Guy"
## [22] "Billy Connolly" "Sharlto Copley" "George Clooney"
## [25] "Raffey Cassidy" "John C. Reilly" "Jai Courtney"
## [28] "Jim Conroy" "Art Carney" "Henry Czerny"
## [31] "Tim Conway" "Joel Courtney" "Tim Curry"
## [34] "Chandler Canterbury" "Jonathan Chan-Pensley" "Andrew Dice Clay"
## [37] "Rob Corddry" "Reeve Carney" "Emma Cleasby"
## [40] "Kristen Connolly" "Chris Coy" "Cab Calloway"
## [43] "Darryl Cooksey" "John Candy" "Annie Corley"
## [46] "Frances Conroy" "Dana Carvey" "Katie Cassidy"
## [49] "Jarlath Conroy" "Anna Chlumsky" "Elaine Cassidy"
## [52] "Ellie Chidzey" "Anne Consigny" "Jack Conley"
## [55] "Julia Chantrey" "Denise Crosby" "Don 'D.C.' Curry"
## [58] "Julius Carry" "Babou Ceesay" "Mary Crosby"
## [61] "Matt Czuchry" "Jeff Conaway" "Chloe Csengery"
## [64] "Reg E. Cathey" "Sarita Choudhury" "David Connolly"
## [67] "Noam Chomsky" "Christopher Curry" "Jake Cherry"
## [70] "Robert Clohessy" "Joanna Cassidy" "Gary Conway"
## [73] "Katie Cleary"
# Find actors whose first OR last name starts with C and ends with Y; the
# false positives of the two separate searches still exist here
first_name_hit <- grepl("^C", actors, ignore.case = TRUE) &
  grepl("y\\s", actors, ignore.case = TRUE) &
  !grepl("C\\.", actors, ignore.case = TRUE)
last_name_hit <- grepl("\\sC", actors, ignore.case = TRUE) &
  grepl("y$", actors, ignore.case = TRUE)
sort(actors[first_name_hit | last_name_hit])
## [1] "Aashish Chaudhary" "Alok Chakravarty"
## [3] "Andrew Dice Clay" "Anna Chlumsky"
## [5] "Anne Consigny" "Annie Corley"
## [7] "Art Carney" "Babou Ceesay"
## [9] "Bill Cosby" "Billy Connolly"
## [11] "Cab Calloway" "Caity Lotz"
## [13] "Caleb Landry Jones" "Caley Hayes"
## [15] "Candy Ford" "Carey Crim"
## [17] "Carey Lowell" "Carey Means"
## [19] "Carey Mulligan" "Carly Nahon"
## [21] "Carly Schroeder" "Cary Elwes"
## [23] "Cary Grant" "Casey Affleck"
## [25] "Casey Dubois" "Casey Fallo"
## [27] "Casey Gooden" "Casey Groves"
## [29] "Casey Hooper" "Casey Twenter"
## [31] "Cassidy Gifford" "Cathy Meils"
## [33] "Cathy Moriarty" "Chandler Canterbury"
## [35] "Chaney Kley" "Charles Henry Wyson"
## [37] "Charley Grapewin" "Chauncey Leopardi"
## [39] "Chelsey Reist" "Cherry Jones"
## [41] "Chevy Chase" "Chloe Csengery"
## [43] "Chris Coy" "Christopher Carley"
## [45] "Christopher Curry" "Christy Chung"
## [47] "Christy Lighthouse" "Cicely Tyson"
## [49] "Cindy Butler" "Cindy Karr"
## [51] "Cindy Manion" "Clancy Brown"
## [53] "Clemency Burton-Hill" "Cody Horn"
## [55] "Cody Howard" "Cody Linley"
## [57] "Cody McMains" "Colby French"
## [59] "Corey Burton" "Corey Feldman"
## [61] "Corey Haim" "Corey Hawkins"
## [63] "Corey Johnson" "Corey Moosa"
## [65] "Corey Sevier" "Corey Stoll"
## [67] "Cory Fernandez" "Cory Hardrict"
## [69] "Cory Hodges" "Cory Monteith"
## [71] "Courteney Cox" "Courtney B. Vance"
## [73] "Courtney Barilla" "Courtney Bell"
## [75] "Courtney Eaton" "Courtney Halverson"
## [77] "Courtney Taylor Burness" "Courtney Thorne-Smith"
## [79] "Dana Carvey" "Darryl Cooksey"
## [81] "David Connolly" "Denise Crosby"
## [83] "Don 'D.C.' Curry" "Elaine Cassidy"
## [85] "Ellie Chidzey" "Emma Cleasby"
## [87] "Frances Conroy" "Gary Conway"
## [89] "George Clooney" "Henry Czerny"
## [91] "Jack Conley" "Jai Courtney"
## [93] "Jake Cherry" "Jarlath Conroy"
## [95] "Jeff Conaway" "Jennifer Connelly"
## [97] "Jim Carrey" "Jim Conroy"
## [99] "Joanna Cassidy" "Joel Courtney"
## [101] "John C. McGinley" "John C. Reilly"
## [103] "John Candy" "Jonathan Chan-Pensley"
## [105] "Jonathan Cherry" "Joyce Carey"
## [107] "Julia Chantrey" "Julius Carry"
## [109] "Katie Cassidy" "Katie Cleary"
## [111] "Kevin Casey" "Kristen Connolly"
## [113] "Larry the Cable Guy" "Mahima Chaudhry"
## [115] "Mariah Carey" "Mary Crosby"
## [117] "Matt Czuchry" "Mithun Chakraborty"
## [119] "Noam Chomsky" "Raffey Cassidy"
## [121] "Raj Singh Chaudhary" "Reeve Carney"
## [123] "Reg E. Cathey" "Rob Corddry"
## [125] "Robert Clohessy" "Sabyasachi Chakraborty"
## [127] "Sarita Choudhury" "Sean Connery"
## [129] "Sharlto Copley" "Tim Conway"
## [131] "Tim Curry" "Tom Courtenay"
## [133] "Tota Roy Chowdhury" "Walter Connolly"
## [135] "Wendell Corey"
# Split each actor's name into its space-separated parts (one list element
# per actor)
actors_list <- str_split(actors, " ")
# Convert the ragged list to a matrix, one row per actor; shorter names are
# padded with NA
actors_matrix <- stri_list2matrix(actors_list, byrow = TRUE)
# Convert to data frame. stringsAsFactors = FALSE keeps the name parts as
# character on every R version (before R 4.0 the default produced factors,
# which coalesce(x, "") below cannot combine with a string).
actors_df <- as.data.frame(actors_matrix, stringsAsFactors = FALSE)
# Name the columns; this assumes the matrix has exactly four columns, i.e.
# the longest name has four parts
colnames(actors_df) <- c("name1", "name2", "name3", "name4")
# Create column with the name re-joined: missing parts become "" so they add
# only trailing spaces, which str_trim removes
actors_df$name <- str_trim(paste(
  coalesce(actors_df$name1, ""),
  coalesce(actors_df$name2, ""),
  coalesce(actors_df$name3, ""),
  coalesce(actors_df$name4, ""),
  sep = " "
))
# View data frame, ordered by first name part
head(actors_df[order(actors_df$name1), ], 20)
## name1 name2 name3 name4 name
## 7194 'Weird Al' Yankovic <NA> 'Weird Al' Yankovic
## 3050 50 Cent <NA> <NA> 50 Cent
## 4089 A. Russell Andrews <NA> A. Russell Andrews
## 7811 A.D. Miles <NA> <NA> A.D. Miles
## 4480 A.J. Cook <NA> <NA> A.J. Cook
## 4726 A.J. Langer <NA> <NA> A.J. Langer
## 8032 A.J. Buckley <NA> <NA> A.J. Buckley
## 9826 A.J. DeLucia <NA> <NA> A.J. DeLucia
## 3975 Aaliyah <NA> <NA> <NA> Aaliyah
## 1161 Aamir Khan <NA> <NA> Aamir Khan
## 3343 Aaran Thomas <NA> <NA> Aaran Thomas
## 402 Aaron Eckhart <NA> <NA> Aaron Eckhart
## 2164 Aaron Paul <NA> <NA> Aaron Paul
## 3101 Aaron Kwok <NA> <NA> Aaron Kwok
## 3282 Aaron Taylor-Johnson <NA> <NA> Aaron Taylor-Johnson
## 3920 Aaron Yoo <NA> <NA> Aaron Yoo
## 4677 Aaron Murphy <NA> <NA> Aaron Murphy
## 9345 Aaron Ruell <NA> <NA> Aaron Ruell
## 9717 Aaron Stanford <NA> <NA> Aaron Stanford
## 8812 Aasheekaa Bathija <NA> <NA> Aasheekaa Bathija
# Check each individual name part: it must start with "C" and end with "y"
# (case-insensitive). The pattern is anchored at both ends so the whole part
# is tested; ".*" (any run of characters) is the regex wildcard here, whereas
# "C*" would only mean "zero or more C". NA parts give FALSE.
c_to_y <- "^C.*y$"
actors_df$name1_match <- grepl(c_to_y, actors_df$name1, ignore.case = TRUE)
actors_df$name2_match <- grepl(c_to_y, actors_df$name2, ignore.case = TRUE)
actors_df$name3_match <- grepl(c_to_y, actors_df$name3, ignore.case = TRUE)
actors_df$name4_match <- grepl(c_to_y, actors_df$name4, ignore.case = TRUE)
# Filter to keep actors with at least one matching name part
actors_df <- actors_df %>%
  filter(name1_match | name2_match | name3_match | name4_match)
# View names; the false positives of the whole-name searches (e.g. a middle
# name ending in y, or a bare "C." initial) no longer appear
actors_df$name %>% sort() %>% unique()
## [1] "Aashish Chaudhary" "Alok Chakravarty"
## [3] "Andrew Dice Clay" "Anna Chlumsky"
## [5] "Anne Consigny" "Annie Corley"
## [7] "Art Carney" "Babou Ceesay"
## [9] "Bill Cosby" "Billy Connolly"
## [11] "Cab Calloway" "Caity Lotz"
## [13] "Caley Hayes" "Candy Ford"
## [15] "Carey Crim" "Carey Lowell"
## [17] "Carey Means" "Carey Mulligan"
## [19] "Carly Nahon" "Carly Schroeder"
## [21] "Cary Elwes" "Cary Grant"
## [23] "Casey Affleck" "Casey Dubois"
## [25] "Casey Fallo" "Casey Gooden"
## [27] "Casey Groves" "Casey Hooper"
## [29] "Casey Twenter" "Cassidy Gifford"
## [31] "Cathy Meils" "Cathy Moriarty"
## [33] "Chandler Canterbury" "Chaney Kley"
## [35] "Charley Grapewin" "Chauncey Leopardi"
## [37] "Chelsey Reist" "Cherry Jones"
## [39] "Chevy Chase" "Chloe Csengery"
## [41] "Chris Coy" "Christopher Carley"
## [43] "Christopher Curry" "Christy Chung"
## [45] "Christy Lighthouse" "Cicely Tyson"
## [47] "Cindy Butler" "Cindy Karr"
## [49] "Cindy Manion" "Clancy Brown"
## [51] "Clemency Burton-Hill" "Cody Horn"
## [53] "Cody Howard" "Cody Linley"
## [55] "Cody McMains" "Colby French"
## [57] "Corey Burton" "Corey Feldman"
## [59] "Corey Haim" "Corey Hawkins"
## [61] "Corey Johnson" "Corey Moosa"
## [63] "Corey Sevier" "Corey Stoll"
## [65] "Cory Fernandez" "Cory Hardrict"
## [67] "Cory Hodges" "Cory Monteith"
## [69] "Courteney Cox" "Courtney B. Vance"
## [71] "Courtney Barilla" "Courtney Bell"
## [73] "Courtney Eaton" "Courtney Halverson"
## [75] "Courtney Taylor Burness" "Courtney Thorne-Smith"
## [77] "Dana Carvey" "Darryl Cooksey"
## [79] "David Connolly" "Denise Crosby"
## [81] "Don 'D.C.' Curry" "Elaine Cassidy"
## [83] "Ellie Chidzey" "Emma Cleasby"
## [85] "Frances Conroy" "Gary Conway"
## [87] "George Clooney" "Henry Czerny"
## [89] "Jack Conley" "Jai Courtney"
## [91] "Jake Cherry" "Jarlath Conroy"
## [93] "Jeff Conaway" "Jennifer Connelly"
## [95] "Jim Carrey" "Jim Conroy"
## [97] "Joanna Cassidy" "Joel Courtney"
## [99] "John Candy" "Jonathan Chan-Pensley"
## [101] "Jonathan Cherry" "Joyce Carey"
## [103] "Julia Chantrey" "Julius Carry"
## [105] "Katie Cassidy" "Katie Cleary"
## [107] "Kevin Casey" "Kristen Connolly"
## [109] "Mahima Chaudhry" "Mariah Carey"
## [111] "Mary Crosby" "Matt Czuchry"
## [113] "Mithun Chakraborty" "Noam Chomsky"
## [115] "Raffey Cassidy" "Raj Singh Chaudhary"
## [117] "Reeve Carney" "Reg E. Cathey"
## [119] "Rob Corddry" "Robert Clohessy"
## [121] "Sabyasachi Chakraborty" "Sarita Choudhury"
## [123] "Sean Connery" "Sharlto Copley"
## [125] "Tim Conway" "Tim Curry"
## [127] "Tom Courtenay" "Tota Roy Chowdhury"
## [129] "Walter Connolly" "Wendell Corey"
# GSUB & SUB ----
# Find actors with a part of their name wrapped in single quotes (')
# https://javascript.info/regexp-greedy-and-lazy
# https://www.rexegg.com/regex-quantifiers.php
# A greedy quantifier gives the longest match, a lazy one (*?) the shortest.
# grep only reports which values contain a match, so every quoted name is
# returned
grep(pattern = "'.*?'", x = actors, ignore.case = TRUE, value = TRUE)
## [1] "Gary 'G. Thang' Johnson" "Scott 'Carrot Top' Thompson"
## [3] "Cesáreo Quezadas 'Pulgarcito'" "José Luis Aguirre 'Trotsky'"
## [5] "Tung Cho 'Joe' Cheung" "Oliver 'Ole' Zemen"
## [7] "Michael 'Xeno' Langebeck" "Joanna 'JoJo' Levesque"
## [9] "Don 'D.C.' Curry" "Eddie 'Piolin' Sotelo"
## [11] "'Weird Al' Yankovic" "George 'Buck' Flower"
## [13] "Tommy 'Tiny' Lister" "Will 'Spank' Horton"
## [15] "Mike 'The Miz' Mizanin" "Yousef 'Joe' Sweid"
## [17] "Stephanie 'Stevvi' Alexander" "Julie 'Jules' Urich"
## [19] "Wilbur 'Hi-Fi' White" "Chris 'Wonder' Schoeck"
# Keep the quoted-name matches for the sub/gsub examples
quotation_actors <- grep(pattern = "'.*?'", x = actors, ignore.case = TRUE, value = TRUE)
# gsub replaces every occurrence, so both quotes become "("
gsub(pattern = "'", replacement = "(", x = quotation_actors)
## [1] "Gary (G. Thang( Johnson" "Scott (Carrot Top( Thompson"
## [3] "Cesáreo Quezadas (Pulgarcito(" "José Luis Aguirre (Trotsky("
## [5] "Tung Cho (Joe( Cheung" "Oliver (Ole( Zemen"
## [7] "Michael (Xeno( Langebeck" "Joanna (JoJo( Levesque"
## [9] "Don (D.C.( Curry" "Eddie (Piolin( Sotelo"
## [11] "(Weird Al( Yankovic" "George (Buck( Flower"
## [13] "Tommy (Tiny( Lister" "Will (Spank( Horton"
## [15] "Mike (The Miz( Mizanin" "Yousef (Joe( Sweid"
## [17] "Stephanie (Stevvi( Alexander" "Julie (Jules( Urich"
## [19] "Wilbur (Hi-Fi( White" "Chris (Wonder( Schoeck"
# sub replaces only the first occurrence, so the closing quote is left alone
sub(pattern = "'", replacement = "(", x = quotation_actors)
## [1] "Gary (G. Thang' Johnson" "Scott (Carrot Top' Thompson"
## [3] "Cesáreo Quezadas (Pulgarcito'" "José Luis Aguirre (Trotsky'"
## [5] "Tung Cho (Joe' Cheung" "Oliver (Ole' Zemen"
## [7] "Michael (Xeno' Langebeck" "Joanna (JoJo' Levesque"
## [9] "Don (D.C.' Curry" "Eddie (Piolin' Sotelo"
## [11] "(Weird Al' Yankovic" "George (Buck' Flower"
## [13] "Tommy (Tiny' Lister" "Will (Spank' Horton"
## [15] "Mike (The Miz' Mizanin" "Yousef (Joe' Sweid"
## [17] "Stephanie (Stevvi' Alexander" "Julie (Jules' Urich"
## [19] "Wilbur (Hi-Fi' White" "Chris (Wonder' Schoeck"
# Two passes of sub: the first turns the opening quote into "(", the second
# turns the remaining (formerly second) quote into ")"
parenthesis_actors <- quotation_actors %>%
  sub("'", "(", .) %>%
  sub("'", ")", .)
parenthesis_actors
## [1] "Gary (G. Thang) Johnson" "Scott (Carrot Top) Thompson"
## [3] "Cesáreo Quezadas (Pulgarcito)" "José Luis Aguirre (Trotsky)"
## [5] "Tung Cho (Joe) Cheung" "Oliver (Ole) Zemen"
## [7] "Michael (Xeno) Langebeck" "Joanna (JoJo) Levesque"
## [9] "Don (D.C.) Curry" "Eddie (Piolin) Sotelo"
## [11] "(Weird Al) Yankovic" "George (Buck) Flower"
## [13] "Tommy (Tiny) Lister" "Will (Spank) Horton"
## [15] "Mike (The Miz) Mizanin" "Yousef (Joe) Sweid"
## [17] "Stephanie (Stevvi) Alexander" "Julie (Jules) Urich"
## [19] "Wilbur (Hi-Fi) White" "Chris (Wonder) Schoeck"
# Finding Numbers ----
# Titles containing at least one digit (first 20 shown)
head(grep(pattern = "[0-9]", x = df$Title, ignore.case = TRUE, value = TRUE), 20)
## [1] "Superbabies: Baby Geniuses 2"
## [2] "Birdemic 2: The Resurrection"
## [3] "Dracula 3000"
## [4] "Leonard Part 6"
## [5] "Kyaa Kool Hain Hum 3"
## [6] "Lawnmower Man 2: Beyond Cyberspace"
## [7] "Car 54, Where Are You?"
## [8] "12 Angry Men"
## [9] "Se7en"
## [10] "Terminator 2: Judgment Day"
## [11] "Toy Story 3"
## [12] "2001: A Space Odyssey"
## [13] "Kill Bill: Vol. 1"
## [14] "The Legend of 1900"
## [15] "12 Years a Slave"
## [16] "Harry Potter and the Deathly Hallows: Part 2"
## [17] "Guardians of the Galaxy Vol. 2"
## [18] "Stalag 17"
## [19] "Short Term 12"
## [20] "3 Idiots"
# Replace non-digits with empty string, format what is left as a number.
# Titles without any digit become "" and therefore NA, which is dropped.
# Note that digits from separate numbers in the same title are concatenated
# into a single number. The conversion is done once and reused.
title_digits <- as.numeric(gsub("\\D", "", df$Title))
unique(title_digits[!is.na(title_digits)])
## [1] 2 3000 6 3 54 12 7 2001 1 1900 17 26
## [13] 24 420 20 9211 10 21 400 8 2012 47 4 13
## [25] 300 123 80 19 102 2000 22 50 571 49 51 44
## [37] 310 16 28 15 42 5 40 23 1941 9 27 1330
## [49] 3313 30 88 1911 39 33 1408 93 127 4040 55 2046
## [61] 911 43 1974 60 40000 5050 500 46 1000 200 1114 786
## [73] 100 20000 18 90 25 1776 213 1984 72 2016 81 247
## [85] 66 41 132 1838 1982 432 70
# Find titles containing a spelled-out "one"/"three" or a Roman numeral.
# "one", "three" and "IV" must have whitespace on both sides; "II" may appear
# anywhere, so it also hits III, VII and VIII. "II" is written as a plain
# literal because "\I" is not a defined regex escape.
grep("\\sone\\s|II|\\sthree\\s|\\sIV\\s", df$Title, value = TRUE, ignore.case = TRUE)
## [1] "Dark Harvest II: The Maize"
## [2] "Boggy Creek II: And the Legend Continues"
## [3] "The Godfather: Part II"
## [4] "Star Wars: Episode IV - A New Hope"
## [5] "It Happened One Night"
## [6] "No One Killed Jessica"
## [7] "Mission: Impossible III"
## [8] "Men in Black II"
## [9] "Bad Boys II"
## [10] "Harry Potter and the Deathly Hallows: Part II"
## [11] "Mission: Impossible II"
## [12] "Star Wars: Episode III - Revenge of the Sith"
## [13] "Star Wars: Episode II - Attack of the Clones"
## [14] "Jurassic Park III"
## [15] "The Three Musketeers"
## [16] "Nutty Professor II: The Klumps"
## [17] "Hellboy II: The Golden Army"
## [18] "The Hangover Part II"
## [19] "Rambo III"
## [20] "Superman II"
## [21] "Blade II"
## [22] "The Godfather: Part III"
## [23] "Beverly Hills Cop III"
## [24] "Rambo: First Blood Part II"
## [25] "Back to the Future Part II"
## [26] "Back to the Future Part III"
## [27] "Superman III"
## [28] "The Three Stooges"
## [29] "The Next Three Days"
## [30] "Teenage Mutant Ninja Turtles II: The Secret of the Ooze"
## [31] "Jeepers Creepers II"
## [32] "Teenage Mutant Ninja Turtles III"
## [33] "Beverly Hills Cop II"
## [34] "Les couloirs du temps: Les visiteurs II"
## [35] "Star Trek III: The Search for Spock"
## [36] "Halloween II"
## [37] "The Hills Have Eyes II"
## [38] "Crocodile Dundee II"
## [39] "Exorcist II: The Heretic"
## [40] "Star Trek II: The Wrath of Khan"
## [41] "A Tale of Three Cities"
## [42] "Saw III"
## [43] "Atlas Shrugged II: The Strike"
## [44] "Poltergeist III"
## [45] "Richard III"
## [46] "The Boondock Saints II: All Saints Day"
## [47] "Hostel: Part II"
## [48] "The Work and the Glory II: American Zion"
## [49] "Saw II"
## [50] "Clerks II"
## [51] "Friday the 13th Part VIII: Jason Takes Manhattan"
## [52] "The Last Exorcism Part II"
## [53] "Menace II Society"
## [54] "Evil Dead II"
## [55] "Phantasm II"
## [56] "Thirteen Conversations About One Thing"
## [57] "Friday the 13th Part VII: The New Blood"
## [58] "Halloween III: Season of the Witch"
## [59] "The Toxic Avenger Part II"
## [60] "Friday the 13th Part III"
## [61] "Return of the Living Dead III"
# Finding Punctuation ----
# Titles containing any punctuation character (first 30 shown)
head(grep(pattern = "[[:punct:]]", x = df$Title, ignore.case = TRUE, value = TRUE), 30)
## [1] "Code Name: K.O.Z."
## [2] "Superbabies: Baby Geniuses 2"
## [3] "Manos: The Hands of Fate"
## [4] "Pledge This!"
## [5] "Foodfight!"
## [6] "Birdemic: Shock and Terror"
## [7] "Dream.net"
## [8] "Titanic: The Legend Goes On..."
## [9] "The Hottie & the Nottie"
## [10] "Keloglan vs. the Black Prince"
## [11] "A Fox's Tale"
## [12] "Ram Gopal Varma's Indian Flames"
## [13] "Ben & Arthur"
## [14] "Birdemic 2: The Resurrection"
## [15] "Monster a-Go Go"
## [16] "Who's Your Caddy?"
## [17] "Anne B. Real"
## [18] "The Incredibly Strange Creatures Who Stopped Living and Became Mixed-Up Zombies!!?"
## [19] "Lawnmower Man 2: Beyond Cyberspace"
## [20] "Dark Harvest II: The Maize"
## [21] "Boggy Creek II: And the Legend Continues"
## [22] "Car 54, Where Are You?"
## [23] "Dragonball: Evolution"
## [24] "It's Pat: The Movie"
## [25] "Ghosts Can't Do It"
## [26] "The Godfather: Part II"
## [27] "Schindler's List"
## [28] "The Lord of the Rings: The Return of the King"
## [29] "The Lord of the Rings: The Fellowship of the Ring"
## [30] "Star Wars: Episode V - The Empire Strikes Back"
# Find titles containing a question mark anywhere. "?" is a regex quantifier,
# so it is escaped to match the literal character; nothing is needed in front
# of it (a leading "*" would have nothing to repeat).
grep("\\?", df$Title, value = TRUE, ignore.case = TRUE)
## [1] "Who's Your Caddy?"
## [2] "The Incredibly Strange Creatures Who Stopped Living and Became Mixed-Up Zombies!!?"
## [3] "Car 54, Where Are You?"
## [4] "What Ever Happened to Baby Jane?"
## [5] "Who's Afraid of Virginia Woolf?"
## [6] "Did You Hear About the Morgans?"
## [7] "What Planet Are You From?"
## [8] "What's the Worst That Could Happen?"
## [9] "O Brother, Where Art Thou?"
## [10] "Are We There Yet?"
## [11] "What's Your Number?"
## [12] "Why Did I Get Married?"
## [13] "Dude, Where's My Car?"
## [14] "When Did You Last See Your Father?"
## [15] "Atlas Shrugged: Who Is John Galt?"
## [16] "What the #$*! Do We (K)now!?"
## [17] "Do You Believe?"
## [18] "Who Killed the Electric Car?"
## [19] "Dude, Where's My Dog?!"
# Find questions: titles that end with a question mark ("$" anchors the end)
grep(pattern = "\\?$", x = df$Title, ignore.case = TRUE, value = TRUE)
## [1] "Who's Your Caddy?"
## [2] "The Incredibly Strange Creatures Who Stopped Living and Became Mixed-Up Zombies!!?"
## [3] "Car 54, Where Are You?"
## [4] "What Ever Happened to Baby Jane?"
## [5] "Who's Afraid of Virginia Woolf?"
## [6] "Did You Hear About the Morgans?"
## [7] "What Planet Are You From?"
## [8] "What's the Worst That Could Happen?"
## [9] "O Brother, Where Art Thou?"
## [10] "Are We There Yet?"
## [11] "What's Your Number?"
## [12] "Why Did I Get Married?"
## [13] "Dude, Where's My Car?"
## [14] "When Did You Last See Your Father?"
## [15] "Atlas Shrugged: Who Is John Galt?"
## [16] "What the #$*! Do We (K)now!?"
## [17] "Do You Believe?"
## [18] "Who Killed the Electric Car?"
# Finding word frequency ----
# Tokenise the titles: strip punctuation, split on single spaces, lower-case
titles_no_punct <- str_replace_all(df$Title, "[[:punct:]]", "")
title_list <- str_to_lower(unlist(strsplit(titles_no_punct, " ")))
# Count how often each token occurs (the table is kept wrapped in a list)
title_list_freq <- list(table(title_list))
# Turn the counts into a two-column data frame
title_df <- setNames(data.frame(title_list_freq), c("word", "freq"))
# Peek at the first rows
head(title_df, 10)
## word freq
## 1 77
## 2 $ 1
## 3 + 2
## 4 1 6
## 5 10 5
## 6 1000 1
## 7 102 1
## 8 10th 1
## 9 11 1
## 10 1114 1
# Ten most frequent words, skipping the empty token
title_df %>%
  filter(word != "") %>%
  arrange(desc(freq)) %>%
  head(10)
## word freq
## 1 the 1617
## 2 of 508
## 3 a 197
## 4 and 153
## 5 in 129
## 6 2 112
## 7 to 111
## 8 man 69
## 9 love 59
## 10 on 55
# Find first words: start by splitting every title on single spaces
titles <- strsplit(df$Title, " ")
length(titles)
## [1] 5273
# Take the first word of every title. vapply builds the whole vector in one
# pass instead of growing it with append() on each iteration, and it returns
# an empty vector (rather than failing) when `titles` is empty. A title with
# no words yields NA, since [1] indexes past the end.
first_words <- vapply(
  titles,
  function(words) words[1],
  character(1),
  USE.NAMES = FALSE
)
first_words %>% head(10)
## [1] "Code" "Saving" "Superbabies:" "Daniel" "Manos:"
## [6] "Pledge" "Turks" "Foodfight!" "Birdemic:" "Dream.net"
# Find first word frequency: strip punctuation, lower-case, then tabulate
first_words_freq <- first_words %>%
  str_replace_all(pattern = "[:punct:]", replacement = "") %>%
  str_to_lower() %>%
  table()
# Convert to data frame and show the ten most common non-empty first words
first_words_df <- setNames(data.frame(first_words_freq), c("word", "freq"))
first_words_df %>%
  filter(word != "") %>%
  arrange(desc(freq)) %>%
  head(10)
## word freq
## 1 the 1010
## 2 a 84
## 3 my 34
## 4 i 31
## 5 in 21
## 6 star 19
## 7 american 18
## 8 all 16
## 9 red 16
## 10 black 15
# Find last words: split every title on single spaces again
titles <- strsplit(df$Title, " ")
length(titles)
## [1] 5273
# Take the last word of every title. vapply builds the whole vector in one
# pass instead of growing it with append() on each iteration, and it returns
# an empty vector (rather than failing) when `titles` is empty. A title with
# no words yields NA so that last_words stays aligned with `titles`; table()
# ignores NA by default, so the frequency counts are unaffected.
last_words <- vapply(
  titles,
  function(words) {
    if (length(words) > 0) words[length(words)] else NA_character_
  },
  character(1),
  USE.NAMES = FALSE
)
last_words %>% head(10)
## [1] "K.O.Z." "Christmas" "2" "Zauberer" "Fate"
## [6] "This!" "Space" "Foodfight!" "Terror" "Dream.net"
# Find last word frequency: strip punctuation, lower-case, then tabulate
last_words_freq <- last_words %>%
  str_replace_all(pattern = "[:punct:]", replacement = "") %>%
  str_to_lower() %>%
  table()
# Convert to data frame and show the twenty most common non-empty last words
last_words_df <- setNames(data.frame(last_words_freq), c("word", "freq"))
last_words_df %>%
  filter(word != "") %>%
  arrange(desc(freq)) %>%
  head(20)
## word freq
## 1 2 79
## 2 man 42
## 3 movie 38
## 4 love 24
## 5 ii 23
## 6 me 22
## 7 story 22
## 8 3 20
## 9 day 20
## 10 dead 20
## 11 girl 18
## 12 men 17
## 13 house 16
## 14 world 15
## 15 you 15
## 16 it 14
## 17 iii 13
## 18 war 13
## 19 boy 12
## 20 days 12